/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 */
#include <utils/utils.h>
#include <utils/reciprocal_div.h>
#include <seminix/smp.h>
#include <seminix/init.h>
#include <seminix/cpu.h>
#include <seminix/cpumask.h>
#include <seminix/signal.h>
#include <seminix/irq/irq_regs.h>
#include <seminix/wait.h>
#include <seminix/mutex.h>
#include <seminix/completion.h>
#include <seminix/dump_stack.h>
#include <seminix/uaccess.h>
#include <asm-generic/switch_to.h>
#include <asm/mmu_context.h>
#include "sched.h"

/*
 * in_sched_functions - report whether @addr lies inside scheduler code.
 *
 * True when @addr is within the locking primitives or inside the
 * [__sched_text_start, __sched_text_end) linker section.
 */
int in_sched_functions(unsigned long addr)
{
    unsigned long start = (unsigned long)__sched_text_start;
    unsigned long end = (unsigned long)__sched_text_end;

    if (in_lock_functions(addr))
        return 1;

    return addr >= start && addr < end;
}

/*
 * start_bandwidth_timer - (re)arm a periodic bandwidth-enforcement timer.
 * @period_timer: hrtimer to arm
 * @period: period to advance the expiry by
 *
 * Loop until the timer is observed active: forward the expiry past "now"
 * by whole periods, then start it in absolute mode while preserving the
 * timer's existing soft/hard expiry window (slack).
 */
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
    unsigned long delta;
    ktime_t soft, hard, now;

    for (;;) {
        /* Already armed (by us on a prior iteration, or elsewhere). */
        if (hrtimer_active(period_timer))
            break;

        now = hrtimer_cb_get_time(period_timer);
        /* Advance expiry by whole periods so it lies in the future. */
        hrtimer_forward(period_timer, now, period);

        /* delta = configured slack between soft and hard expiry. */
        soft = hrtimer_get_softexpires(period_timer);
        hard = hrtimer_get_expires(period_timer);
        delta = ktime_to_ns(ktime_sub(hard, soft));
        hrtimer_start_range_ns(period_timer, soft, delta,
                     HRTIMER_MODE_ABS);
    }
}

/* The per-CPU runqueues; access via cpu_rq()/this_rq()/task_rq(). */
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

/* Fold a clock delta into the task clock (here: a plain accumulate). */
static inline void update_rq_clock_task(struct rq *rq, s64 delta)
{
    rq->clock_task += delta;
}

/*
 * update_rq_clock - advance rq->clock to the current sched_clock time.
 *
 * Caller holds rq->lock.  A positive skip_clock_update suppresses the
 * update; it is set when a reschedule is imminent to avoid a useless
 * back-to-back clock update (see check_preempt_curr()).
 */
void update_rq_clock(struct rq *rq)
{
    s64 delta;

    if (rq->skip_clock_update > 0)
        return;

    delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
    rq->clock += delta;
    update_rq_clock_task(rq, delta);
}

/*
 * __task_rq_lock - lock the rq @p resides on.
 *
 * Retry loop: @p may migrate between reading task_rq(p) and acquiring
 * the lock, so ownership is re-checked under the lock.
 */
static inline struct rq *__task_rq_lock(struct tcb *p)
{
    struct rq *rq;

    for (;;) {
        rq = task_rq(p);
        spin_lock(&rq->lock);
        if (likely(rq == task_rq(p)))
            return rq;
        /* Task migrated while we spun on the lock: drop and retry. */
        spin_unlock(&rq->lock);
    }
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 *
 * Lock order: p->pi_lock (irqs saved/disabled) first, then rq->lock.
 * Same migration re-check as __task_rq_lock().
 */
static struct rq *task_rq_lock(struct tcb *p, unsigned long *flags)
{
    struct rq *rq;

    for (;;) {
        raw_spin_lock_irqsave(&p->pi_lock, *flags);
        rq = task_rq(p);
        spin_lock(&rq->lock);
        if (likely(rq == task_rq(p)))
            return rq;
        /* Migrated: release in reverse order and retry. */
        spin_unlock(&rq->lock);
        raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
    }
}

/* Unlock counterpart of __task_rq_lock(). */
static void __task_rq_unlock(struct rq *rq)
{
    spin_unlock(&rq->lock);
}

/* Unlock counterpart of task_rq_lock(): rq->lock, then p->pi_lock. */
static inline void
task_rq_unlock(struct rq *rq, struct tcb *p, unsigned long *flags)
{
    spin_unlock(&rq->lock);
    raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}

/*
 * this_rq_lock - disable interrupts and lock this CPU's runqueue.
 *
 * Returns the locked local runqueue.  Interrupts must be disabled
 * before this_rq() is evaluated so we cannot be migrated in between.
 */
static struct rq *this_rq_lock(void)
{
    struct rq *local_rq;

    local_irq_disable();
    local_rq = this_rq();
    spin_lock(&local_rq->lock);

    return local_rq;
}

/*
 * Use HR-timers to deliver accurate preemption points.
 */

/* Cancel a pending hrtick, if one is armed. */
static void hrtick_clear(struct rq *rq)
{
    if (hrtimer_active(&rq->hrtick_timer))
        hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 *
 * Delivers a "queued" tick (third task_tick() argument == 1) to the
 * current task's scheduling class under rq->lock.  One-shot: it never
 * restarts itself; the class re-arms it as needed.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
    struct rq *rq = container_of(timer, struct rq, hrtick_timer);

    /* The hrtick timer is per-rq and must fire on its own CPU. */
    WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

    spin_lock(&rq->lock);
    update_rq_clock(rq);
    rq->curr->sched_class->task_tick(rq, rq->curr, 1);
    spin_unlock(&rq->lock);

    return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

/* Re-arm the hrtick at its already-set (absolute) expiry time. */
static void __hrtick_restart(struct rq *rq)
{
    struct hrtimer *timer = &rq->hrtick_timer;
    ktime_t time = hrtimer_get_softexpires(timer);

    hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS);
}

/*
 * called from hardirq (IPI) context
 *
 * Remote half of hrtick_start(): arms the timer on the rq's own CPU
 * and clears the csd-pending flag under rq->lock.
 */
static void __hrtick_start(void *arg)
{
    struct rq *rq = arg;

    spin_lock(&rq->lock);
    __hrtick_restart(rq);
    rq->hrtick_csd_pending = 0;
    spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 *
 * The expiry is stored on the timer first; if @rq is remote, the actual
 * arming is deferred to an async smp call on the target CPU (hrtimers
 * must be started on the CPU they belong to).  hrtick_csd_pending
 * guards against queueing the csd twice.
 */
void hrtick_start(struct rq *rq, u64 delay)
{
    struct hrtimer *timer = &rq->hrtick_timer;
    ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

    hrtimer_set_expires(timer, time);

    if (rq == this_rq()) {
        __hrtick_restart(rq);
    } else if (!rq->hrtick_csd_pending) {
        smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
        rq->hrtick_csd_pending = 1;
    }
}
#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 *
 * UP: the rq is always local, so start the timer directly (relative).
 */
void hrtick_start(struct rq *rq, u64 delay)
{
    hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
            HRTIMER_MODE_REL);
}

#endif

/* One-time per-rq hrtick setup: csd for remote arming, plus the timer. */
static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
    rq->hrtick_csd_pending = 0;

    rq->hrtick_csd.flags = 0;
    rq->hrtick_csd.func = __hrtick_start;
    rq->hrtick_csd.info = rq;
#endif

    hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    rq->hrtick_timer.function = hrtick;
}

/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
void resched_task(struct tcb *p)
{
    int cpu;

    /* Already marked: nothing to do. */
    if (test_tsk_need_resched(p))
        return;

    set_tsk_need_resched(p);

    cpu = task_cpu(p);
    if (cpu == smp_processor_id()) {
        /* Local task: fold the request into the preempt state. */
        set_preempt_need_resched();
        return;
    }

    /* NEED_RESCHED must be visible before we test polling */
    smp_mb();
    /* A polling idle CPU will notice the flag without an IPI. */
    if (!tsk_is_polling(p))
        smp_send_reschedule(cpu);
}

/*
 * resched_cpu - request a reschedule of whatever is running on @cpu.
 *
 * Best effort: if the runqueue lock cannot be acquired without
 * spinning, the request is silently dropped.
 */
void resched_cpu(int cpu)
{
    unsigned long irqflags;
    struct rq *rq = cpu_rq(cpu);

    if (!spin_trylock_irqsave(&rq->lock, irqflags))
        return;

    resched_task(cpu_curr(cpu));
    spin_unlock_irqrestore(&rq->lock, irqflags);
}

#ifdef CONFIG_SMP
/*
 * sched_can_stop_tick - may the periodic tick be stopped on this CPU?
 *
 * The tick may only stop while at most one task is runnable here;
 * with two or more runnable tasks the tick is needed for preemption.
 */
bool sched_can_stop_tick(void)
{
    struct rq *rq = this_rq();

    /* Make sure rq->nr_running update is visible after the IPI */
    smp_rmb();

    return rq->nr_running <= 1;
}
#endif

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
/* Index: nice + 20 (i.e. [0] is nice -20, [20] is nice 0, [39] is nice 19). */
static const u32 prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};

/*
 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
static const u32 prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};

/*
 * set_load_weight - derive p->se.load from the task's policy/priority.
 *
 * Deadline tasks get 4x the nice -20 weight, realtime tasks 2x;
 * SCHED_IDLE tasks get the minimal idle weight; everything else is
 * looked up by nice level in the prio_to_weight[]/prio_to_wmult[]
 * tables (branch order matters: dl before rt before idle).
 */
static void set_load_weight(struct tcb *p)
{
    int prio;
    struct load_weight *load = &p->se.load;

    if (task_has_dl_policy(p)) {
        load->weight = scale_load(prio_to_weight[0]) * 4;
        load->inv_weight = prio_to_wmult[0] >> 1;
        return;
    }

    if (task_has_rt_policy(p)) {
        load->weight = scale_load(prio_to_weight[0]) * 2;
        load->inv_weight = prio_to_wmult[0] >> 1;
        return;
    }

    /*
     * SCHED_IDLE tasks get minimal weight:
     */
    if (p->policy == SEMINIX_SCHED_IDLE) {
        load->weight = scale_load(WEIGHT_IDLEPRIO);
        load->inv_weight = WMULT_IDLEPRIO;
        return;
    }

    /* Normal task: index the tables by nice level (prio - MAX_RT_PRIO). */
    prio = p->prio - MAX_RT_PRIO;
    load->weight = scale_load(prio_to_weight[prio]);
    load->inv_weight = prio_to_wmult[prio];
}

/* Add @inc to a load_weight; invalidates the cached inverse weight. */
inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
    lw->weight += inc;
    lw->inv_weight = 0;
}

/* Subtract @dec from a load_weight; invalidates the cached inverse weight. */
inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
    lw->weight -= dec;
    lw->inv_weight = 0;
}

/* Fold @p's weight into the runqueue's aggregate load. */
static inline void inc_load(struct rq *rq, const struct tcb *p)
{
    update_load_add(&rq->load, p->se.load.weight);
}

/* Remove @p's weight from the runqueue's aggregate load. */
static inline void dec_load(struct rq *rq, const struct tcb *p)
{
    update_load_sub(&rq->load, p->se.load.weight);
}

/*
 * Account one more runnable task on @rq.
 *
 * On the 1 -> 2 runnable transition, a nohz-full CPU has its tick
 * stopped and must be kicked so it restarts the tick for preemption
 * (see sched_can_stop_tick()).
 */
static inline void inc_nr_running(struct tcb *p, struct rq *rq)
{
    rq->nr_running++;
    inc_load(rq, p);
#ifdef CONFIG_SMP
    if (rq->nr_running == 2) {
        if (tick_nohz_full_cpu(rq->cpu)) {
            /* Order rq->nr_running write against the IPI */
            smp_wmb();
            smp_send_reschedule(rq->cpu);
        }
    }
#endif
}

/* Account one fewer runnable task on @rq. */
static inline void dec_nr_running(struct tcb *p, struct rq *rq)
{
    rq->nr_running--;
    dec_load(rq, p);
}

/* Enqueue @p via its scheduling class, with a fresh rq clock. */
static void enqueue_task(struct rq *rq, struct tcb *p, int flags)
{
    update_rq_clock(rq);
    p->sched_class->enqueue_task(rq, p, flags);
}

/* Dequeue @p via its scheduling class, with a fresh rq clock. */
static void dequeue_task(struct rq *rq, struct tcb *p, int flags)
{
    update_rq_clock(rq);
    p->sched_class->dequeue_task(rq, p, flags);
}

/*
 * Make @p runnable on @rq: enqueue it and update the nr_running and
 * nr_uninterruptible accounting.
 */
static void activate_task(struct rq *rq, struct tcb *p, int flags)
{
    /* An uninterruptible sleeper becoming runnable leaves that count. */
    if (p->state & TASK_UNINTERRUPTIBLE)
        rq->nr_uninterruptible--;

    enqueue_task(rq, p, flags);
    inc_nr_running(p, rq);
}

/*
 * Take @p off @rq: dequeue it and update the nr_running and
 * nr_uninterruptible accounting (mirror of activate_task()).
 */
static void deactivate_task(struct rq *rq, struct tcb *p, int flags)
{
    if (p->state & TASK_UNINTERRUPTIBLE)
        rq->nr_uninterruptible++;

    dequeue_task(rq, p, flags);
    dec_nr_running(p, rq);
}

/*
 * sched_set_stop_task - install (or clear) @cpu's stop task.
 * @cpu: target CPU
 * @stop: new stop task, or NULL to clear
 *
 * The stop task runs in the highest-priority stop_sched_class; the
 * previous stop task (if any) is demoted back to the rt class.
 */
void sched_set_stop_task(int cpu, struct tcb *stop)
{
    struct tcb *old_stop = cpu_rq(cpu)->stop;

    if (stop) {
        /*
         * Make it appear like a SCHED_FIFO task, it's something
         * userspace knows about and won't get confused about.
         *
         * Also, it will make PI more or less work without too
         * much confusion -- but then, stop work should not
         * rely on PI working anyway.
         */
        sched_set_scheduler(stop, stop->policy, stop->prio);

        stop->sched_class = &stop_sched_class;
    }

    cpu_rq(cpu)->stop = stop;

    if (old_stop) {
        /*
         * Reset it back to a normal scheduling class so that
         * it can die in pieces.
         */
        old_stop->sched_class = &rt_sched_class;
    }
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 */
inline int task_curr(const struct tcb *p)
{
    return cpu_curr(task_cpu(p)) == p;
}

/*
 * Notify the scheduling class of a class or priority change on @p:
 * switched_from() on the old class (if implemented), switched_to() on
 * the new one; for a same-class priority change, prio_changed().
 */
static inline void check_class_changed(struct rq *rq, struct tcb *p,
                       const struct sched_class *prev_class,
                       int oldprio)
{
    if (prev_class != p->sched_class) {
        if (prev_class->switched_from)
            prev_class->switched_from(rq, p);
        p->sched_class->switched_to(rq, p);
    } else if (oldprio != p->prio || dl_task(p))
        p->sched_class->prio_changed(rq, p, oldprio);
}

/*
 * check_preempt_curr - should the newly runnable @p preempt rq->curr?
 *
 * Same class: delegate to the class's own preemption test.  Different
 * classes: walk the class hierarchy from the top; if @p's class ranks
 * above the current task's class, force a reschedule.
 */
void check_preempt_curr(struct rq *rq, struct tcb *p, int flags)
{
    const struct sched_class *class;

    if (p->sched_class == rq->curr->sched_class) {
        rq->curr->sched_class->check_preempt_curr(rq, p, flags);
    } else {
        for_each_class(class) {
            if (class == rq->curr->sched_class)
                break;
            if (class == p->sched_class) {
                resched_task(rq->curr);
                break;
            }
        }
    }

    /*
     * A queue event has occurred, and we're going to schedule.  In
     * this case, we can save a useless back to back clock update.
     */
    if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
        rq->skip_clock_update = 1;
}

#ifdef CONFIG_SMP
/* Record @p's home CPU (caller provides the needed locking/ordering). */
static inline void __set_task_cpu(struct tcb *p, int cpu)
{
    p->cpu = cpu;
}

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct tcb *p)
{
    int cpu;

    /* Disable preemption so our own CPU id stays stable for the test. */
    preempt_disable();
    cpu = task_cpu(p);
    if ((cpu != smp_processor_id()) && task_curr(p))
        smp_send_reschedule(cpu);
    preempt_enable();
}
#else
/* UP: there is only one CPU, nothing to record. */
static inline void __set_task_cpu(struct tcb *p, int cpu) {}
#endif

/* Wakeup half 1: put @p back on the runqueue and mark it queued. */
static void ttwu_activate(struct rq *rq, struct tcb *p, int en_flags)
{
    activate_task(rq, p, en_flags);
    p->on_rq = 1;
}

/*
 * Mark the task runnable and perform wakeup-preemption.
 */
static void
ttwu_do_wakeup(struct rq *rq, struct tcb *p, int wake_flags)
{
    check_preempt_curr(rq, p, wake_flags);

    p->state = TASK_RUNNING;
}

/* Full wakeup for a fully descheduled task: enqueue, then wake. */
static void
ttwu_do_activate(struct rq *rq, struct tcb *p, int wake_flags)
{
    ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
    ttwu_do_wakeup(rq, p, wake_flags);
}

/*
 * Called in case the task @p isn't fully descheduled from its runqueue,
 * in this case we must do a remote wakeup. Its a 'light' wakeup though,
 * since all we need to do is flip p->state to TASK_RUNNING, since
 * the task is still ->on_rq.
 *
 * Returns 1 if the light wakeup was done, 0 if the task had already
 * left the runqueue (full wakeup required by the caller).
 */
static int ttwu_remote(struct tcb *p, int wake_flags)
{
    struct rq *rq;
    int ret = 0;

    rq = __task_rq_lock(p);
    if (p->on_rq) {
        /* check_preempt_curr() may use rq clock */
        update_rq_clock(rq);
        ttwu_do_wakeup(rq, p, wake_flags);
        ret = 1;
    }
    __task_rq_unlock(rq);

    return ret;
}

#ifdef CONFIG_SMP
/* Reschedule-IPI handler: runs in hardirq context on the target CPU. */
void scheduler_ipi(void)
{
    /*
     * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
     * TIF_NEED_RESCHED remotely (for the first time) will also send
     * this IPI.
     */
    preempt_fold_need_resched();
}
#endif

/* Take @cpu's rq lock and perform the full activate+wakeup of @p. */
static void ttwu_queue(struct tcb *p, int cpu)
{
    struct rq *rq = cpu_rq(cpu);

    spin_lock(&rq->lock);
    ttwu_do_activate(rq, p, 0);
    spin_unlock(&rq->lock);
}

/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Return: %true if @p was woken up, %false if it was already running.
 * or @state didn't match @p's state.
 */
static int
try_to_wake_up(struct tcb *p, unsigned int state, int wake_flags)
{
    unsigned long flags;
    int cpu, success = 0;

    /*
     * If we are going to wake up a thread waiting for CONDITION we
     * need to ensure that CONDITION=1 done by the caller can not be
     * reordered with p->state check below. This pairs with mb() in
     * set_current_state() the waiting thread does.
     */
    smp_mb__before_spinlock();
    raw_spin_lock_irqsave(&p->pi_lock, flags);
    if (!(p->state & state))
        goto out;

    success = 1; /* we're going to change ->state */
    cpu = task_cpu(p);

    /* Still queued?  A 'light' wakeup (state flip) is enough. */
    if (p->on_rq && ttwu_remote(p, wake_flags))
        goto out;

#ifdef CONFIG_SMP
    /*
     * If the owning (remote) cpu is still in the middle of schedule() with
     * this task as prev, wait until its done referencing the task.
     */
    while (p->on_cpu)
        cpu_relax();
    /*
     * Pairs with the smp_wmb() in finish_lock_switch().
     */
    smp_rmb();

    if (p->sched_class->task_waking)
        p->sched_class->task_waking(p);
#endif
    /* Fully descheduled: re-enqueue on its runqueue and preempt-check. */
    ttwu_queue(p, cpu);
out:
    raw_spin_unlock_irqrestore(&p->pi_lock, flags);

    return success;
}

/* Wake @p from any normal (interruptible or uninterruptible) sleep. */
int wake_up_process(struct tcb *p)
{
    return try_to_wake_up(p, TASK_NORMAL, 0);
}

/* Wake @p only if its current state matches @state. */
int wake_up_state(struct tcb *p, int state)
{
    return try_to_wake_up(p, state, 0);
}

/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 *
 * __sched_new() is basic setup used by init_idle() too: zero the
 * fair-class entity stats, reset the deadline entity (timer + params),
 * and initialize the rt run list.
 */
static void __sched_new(struct tcb *p)
{
    p->on_rq			= 0;

    p->se.on_rq			= 0;
    p->se.exec_start		= 0;
    p->se.sum_exec_runtime		= 0;
    p->se.prev_sum_exec_runtime	= 0;
    p->se.vruntime			= 0;

    RB_CLEAR_NODE(&p->dl.rb_node);
    hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    p->dl.dl_runtime = p->dl.runtime = 0;
    p->dl.dl_deadline = p->dl.deadline = 0;
    p->dl.dl_period = 0;

    INIT_LIST_HEAD(&p->rt.run_list);
}

/*
 * sched_new - scheduler setup for a freshly forked task.
 * @p: the new task
 * @state: initial task state to install
 *
 * Selects the scheduling class from the task's policy/priority.
 * Forking directly into a deadline policy is refused (-EAGAIN), since
 * deadline bandwidth is not inherited.  Returns 0 on success.
 */
int sched_new(struct tcb *p, int state)
{
    __sched_new(p);
    /*
     * We mark the process as running here. This guarantees that
     * nobody will actually run it, and a signal or other external
     * event cannot wake it up and insert it on the runqueue either.
     */
    p->state = state;

    if (dl_policy(p->policy)) {
        return -EAGAIN;
    } else if (rt_prio(p->prio)) {
        p->sched_class = &rt_sched_class;
    } else {
        p->sched_class = &fair_sched_class;
    }

    if (p->sched_class->task_new)
        p->sched_class->task_new(p);

    init_task_preempt_count(p);

    return 0;
}

/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void wake_up_new_task(struct tcb *p)
{
    unsigned long flags;
    struct rq *rq;

    WARN_ON(p->state != TASK_NEW);
    raw_spin_lock_irqsave(&p->pi_lock, flags);

    p->state = TASK_RUNNING;
    rq = __task_rq_lock(p);
    activate_task(rq, p, 0);
    p->on_rq = 1;
    check_preempt_curr(rq, p, WF_FORK);
    /* Drops both rq->lock and p->pi_lock (taken separately above). */
    task_rq_unlock(rq, p, &flags);
}

/*
 * to_ratio - express runtime/period as a fixed-point (<< 20) ratio.
 *
 * RUNTIME_INF maps to the maximum ratio (1.0 in << 20 fixed point).
 */
unsigned long to_ratio(u64 period, u64 runtime)
{
    if (runtime == RUNTIME_INF)
        return 1ULL << 20;

    /*
     * Doing this here saves a lot of checks in all
     * the calling paths, and returning zero seems
     * safe for them anyway.
     */
    if (period == 0)
        return 0;

    return div64_u64(runtime << 20, period);
}

/* Deadline bandwidth accounting root for CPU @i. */
inline struct dl_bw *dl_bw_of(int i)
{
    return &cpu_rq(i)->dl.dl_bw;
}

/* Number of CPUs sharing CPU @i's deadline bandwidth (no root domains here). */
static inline int dl_bw_cpus(int i)
{
    return 1;
}

/* Release @tsk_bw from the total allocated deadline bandwidth. */
static inline
void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
{
    dl_b->total_bw -= tsk_bw;
}

/* Reserve @tsk_bw in the total allocated deadline bandwidth. */
static inline
void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
{
    dl_b->total_bw += tsk_bw;
}

/*
 * Would swapping @old_bw for @new_bw exceed the bandwidth limit?
 * A limit of -1 means "no limit" and never overflows.
 */
static inline
bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
{
    return (s64)dl_b->bw != -1 &&
           dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

/*
 * We must be sure that accepting a new task (or allowing changing the
 * parameters of an existing one) is consistent with the bandwidth
 * constraints. If yes, this function also accordingly updates the currently
 * allocated bandwidth to reflect the new situation.
 *
 * This function is called while holding p's rq->lock.
 *
 * Returns 0 when the change is admitted (bandwidth updated), -1 when it
 * would overflow the available deadline bandwidth.
 */
static int dl_overflow(struct tcb *p, int policy,
               const struct sched_attr *attr)
{

    struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
    u64 period = attr->sched_period ?: attr->sched_deadline;
    u64 runtime = attr->sched_runtime;
    u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
    int cpus, err = -1;

    /* Unchanged bandwidth: trivially admitted. */
    if (new_bw == p->dl.dl_bw)
        return 0;

    /*
     * Either if a task, enters, leave, or stays -deadline but changes
     * its parameters, we may need to update accordingly the total
     * allocated bandwidth of the container.
     *
     * Three cases below: entering -deadline, staying -deadline with new
     * parameters, and leaving -deadline (always admitted).
     */
    raw_spin_lock(&dl_b->lock);
    cpus = dl_bw_cpus(task_cpu(p));
    if (dl_policy(policy) && !task_has_dl_policy(p) &&
        !__dl_overflow(dl_b, cpus, 0, new_bw)) {
        __dl_add(dl_b, new_bw);
        err = 0;
    } else if (dl_policy(policy) && task_has_dl_policy(p) &&
           !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
        __dl_clear(dl_b, p->dl.dl_bw);
        __dl_add(dl_b, new_bw);
        err = 0;
    } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
        __dl_clear(dl_b, p->dl.dl_bw);
        err = 0;
    }
    raw_spin_unlock(&dl_b->lock);

    return err;
}

/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void
prepare_task_switch(struct rq *rq, struct tcb *prev,
            struct tcb *next)
{
    prepare_lock_switch(rq, next);
    prepare_arch_switch(next);
}

/**
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
static void finish_task_switch(struct rq *rq, struct tcb *prev)
    __releases(rq->lock)
{
    long prev_state;

    /*
     * A task struct has one reference for the use as "current".
     * If a task dies, then it sets TASK_DEAD in tsk->state and calls
     * schedule one last time. The schedule call will never return, and
     * the scheduled task must drop that reference.
     * The test for TASK_DEAD must occur while the runqueue locks are
     * still held, otherwise prev could be scheduled on another cpu, die
     * there before we look at prev->state, and then the reference would
     * be dropped twice.
     *		Manfred Spraul <manfred@colorfullife.com>
     */
    prev_state = prev->state;	/* snapshot before the locks are dropped */
    finish_arch_switch(prev);
    finish_lock_switch(rq, prev);
    finish_arch_post_lock_switch();

    /* Give the class its dead-task notification for a dying prev. */
    if (unlikely(prev_state & TASK_DIE))
        if (prev->sched_class->task_dead)
            prev->sched_class->task_dead(prev);

    tick_nohz_task_switch(current);
}

/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 *
 * Completes the context switch begun by the parent's schedule() and
 * re-enables preemption, then publishes the new thread's tid to the
 * registered user address, if any.
 */
asmlinkage void schedule_tail(struct tcb *prev)
{
    struct rq *rq = this_rq();

    finish_task_switch(rq, prev);
    preempt_enable();

    /* NOTE(review): put_user() result is ignored — best-effort write. */
    if (current->tid_address)
        put_user(task_tid_nr(current), current->tid_address);
}

/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 *
 * Called with rq->lock held; the lock is released by
 * finish_task_switch() on the other side of the switch.
 */
static inline void
context_switch(struct rq *rq, struct tcb *prev,
            struct tcb *next)
{
    struct mm_struct *mm, *oldmm;

    prepare_task_switch(rq, prev, next);
    mm = next->mm;
    oldmm = prev->mm;

    switch_mm(oldmm, mm, next);

    /* Here we just switch the register state and the stack. */
    switch_to(prev, next, prev);

    /* Compiler barrier: everything below runs on the new stack. */
    barrier();
    /*
     * this_rq must be evaluated again because prev may have moved
     * CPUs since it called schedule(), thus the 'rq' on its stack
     * frame will be invalid.
     */
    finish_task_switch(this_rq(), prev);
}

/*
 * nr_running, nr_uninterruptible and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, current number of uninterruptible-sleeping threads, total
 * number of context switches performed since bootup.
 */
u64 nr_running(void)
{
    int i;
    u64 sum = 0;

    for_each_online_cpu(i)
        sum += cpu_rq(i)->nr_running;

    return sum;
}

u64 nr_context_switches(void)
{
    int i;
    u64 sum = 0;

    for_each_online_cpu(i)
        sum += cpu_rq(i)->nr_switches;

    return sum;
}

/*
 * Return any ns on the sched_clock that have not yet been accounted in
 * @p in case that task is currently running.
 *
 * Called with task_rq_lock() held on @rq.
 */
static u64 do_task_delta_exec(struct tcb *p, struct rq *rq)
{
    u64 ns = 0;

    if (task_current(rq, p)) {
        update_rq_clock(rq);
        ns = rq_clock_task(rq) - p->se.exec_start;
        /* Clock can race backwards vs exec_start: clamp to zero. */
        if ((s64)ns < 0)
            ns = 0;
    }

    return ns;
}

/* Locked wrapper: unaccounted runtime of @p, 0 if it isn't on a CPU. */
unsigned long long task_delta_exec(struct tcb *p)
{
    unsigned long flags;
    struct rq *rq;
    u64 ns = 0;

    rq = task_rq_lock(p, &flags);
    ns = do_task_delta_exec(p, rq);
    task_rq_unlock(rq, p, &flags);

    return ns;
}

/*
 * Return accounted runtime for the task.
 * In case the task is currently running, return the runtime plus current's
 * pending runtime that have not been accounted yet.
 */
unsigned long long task_sched_runtime(struct tcb *p)
{
    unsigned long flags;
    struct rq *rq;
    u64 ns = 0;

#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
    /*
     * 64-bit doesn't need locks to atomically read a 64bit value.
     * So we have a optimization chance when the task's delta_exec is 0.
     * Reading ->on_cpu is racy, but this is ok.
     *
     * If we race with it leaving cpu, we'll take a lock. So we're correct.
     * If we race with it entering cpu, unaccounted time is 0. This is
     * indistinguishable from the read occurring a few cycles earlier.
     */
    if (!p->on_cpu)
        return p->se.sum_exec_runtime;
#endif

    rq = task_rq_lock(p, &flags);
    ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
    task_rq_unlock(rq, p, &flags);

    return ns;
}

/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC).
 *
 * cpu_load[i] is the runqueue weight decayed over ~2^i ticks:
 *   new = (old * (2^i - 1) + current) / 2^i
 * rounded up while the load is increasing so the average can actually
 * reach the target instead of sticking just below it.
 *
 * this_load is 'unsigned long' to match rq->load.weight and the
 * cpu_load[] entries it feeds; a 32-bit intermediate could truncate
 * large aggregate weights on 64-bit.
 */
static void update_cpu_load(struct rq *this_rq)
{
    unsigned long this_load = this_rq->load.weight;
    int i, scale;

    this_rq->nr_load_updates++;

    /* Update our load: */
    for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
        unsigned long old_load, new_load;

        /* scale is effectively 1 << i now, and >> i divides by scale */

        old_load = this_rq->cpu_load[i];
        new_load = this_load;
        /*
         * Round up the averaging division if load is increasing. This
         * prevents us from getting stuck on 9 if the load is 10, for
         * example.
         */
        if (new_load > old_load)
            new_load += scale - 1;
        this_rq->cpu_load[i] = (old_load*(scale - 1) + new_load) >> i;
    }
}

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * Advances the rq clock, delivers the periodic (non-queued) class tick,
 * and refreshes the per-CPU load averages.
 */
void scheduler_tick(void)
{
    int cpu = smp_processor_id();
    struct rq *rq = cpu_rq(cpu);
    struct tcb *curr = rq->curr;

    spin_lock(&rq->lock);
    update_rq_clock(rq);
    curr->sched_class->task_tick(rq, curr, 0);
    rq->last_load_update_tick = jiffies_64;
    update_cpu_load(rq);
    spin_unlock(&rq->lock);
    /* Bookkeeping for nohz: remember when the last tick ran. */
    rq_last_tick_reset(rq);
}

#ifdef CONFIG_SMP
/**
 * scheduler_tick_max_deferment
 *
 * Keep at least one tick per second when a single
 * active task is running because the scheduler doesn't
 * yet completely support full dynticks environment.
 *
 * This makes sure that uptime, CFS vruntime, load
 * balancing, etc... continue to move forward, even
 * with a very low granularity.
 *
 * Return: Maximum deferment in nanoseconds.
 */
u64 scheduler_tick_max_deferment(void)
{
    struct rq *rq = this_rq();
    /*
     * NOTE(review): 'now'/'next' are unsigned long while jiffies_64 is
     * 64-bit — truncates on 32-bit; presumably tolerated by the
     * time_before_eq() wraparound arithmetic, but worth confirming.
     */
    unsigned long next, now = ACCESS_ONCE(jiffies_64);

    next = rq->last_sched_tick + HZ;

    if (time_before_eq(next, now))
        return 0;

    return jiffies_to_nsecs(next - now);
}
#endif

/*
 * Print scheduling while atomic bug:
 */
static noinline void __schedule_bug(struct tcb *prev)
{
    pr_err("BUG: scheduling while atomic: %s/%d/0x%08x\n",
        prev->comm, prev->tid, preempt_count());

    dump_stack();
}

/*
 * Various schedule()-time debugging checks and statistics:
 */
static inline void schedule_debug(struct tcb *prev)
{
    /*
     * Test if we are atomic. Since do_exit() needs to call into
     * schedule() atomically, we ignore that path. Otherwise whine
     * if we are scheduling when we should not.
     */
    if (unlikely(in_atomic_preempt_off() && !(prev->state & TASK_DIE))) {
        __schedule_bug(prev);
        /* Repair the preempt count so we can limp onwards. */
        preempt_count_set(PREEMPT_DISABLED);
    }
}

/* Hand @prev back to its class, with a fresh clock if it needs accounting. */
static void put_prev_task(struct rq *rq, struct tcb *prev)
{
    if (prev->on_rq || rq->skip_clock_update < 0)
        update_rq_clock(rq);
    prev->sched_class->put_prev_task(rq, prev);
}

/*
 * Pick up the highest-prio task:
 */
static inline struct tcb *
pick_next_task(struct rq *rq)
{
    const struct sched_class *class;
    struct tcb *p;

    /*
     * Optimization: we know that if all tasks are in
     * the fair class we can call that function directly:
     */
    if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
        p = fair_sched_class.pick_next_task(rq);
        if (likely(p))
            return p;
    }

    /* Walk the classes in priority order until one yields a task. */
    for_each_class(class) {
        p = class->pick_next_task(rq);
        if (p)
            return p;
    }

    BUG(); /* the idle class will always have a runnable task */
}

/*
 * schedule() is the main scheduler function.
 */
/*
 * __schedule() - pick the highest-priority runnable task and switch to it.
 *
 * @preempt: true when entered from a preemption point; in that case a
 *           non-running prev->state is ignored and the task stays queued.
 *
 * Called with preemption disabled. rq->lock is dropped by context_switch()
 * (or explicitly below when no switch happens).
 *
 * Fix: the original assigned switch_count = &prev->nivcsw twice — once
 * before schedule_debug() and again right after taking rq->lock. The first
 * assignment was a dead store (always overwritten); it has been removed.
 */
static void __sched notrace __schedule(bool preempt)
{
    struct tcb *prev, *next;
    u64 *switch_count;
    struct rq *rq;
    int cpu;

    cpu = smp_processor_id();
    rq = cpu_rq(cpu);
    prev = rq->curr;

    schedule_debug(prev);

    if (sched_feat(HRTICK))
        hrtick_clear(rq);

    /*
     * Make sure that signal_pending_state()->signal_pending() below
     * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
     * done by the caller to avoid the race with signal_wake_up().
     */
    smp_mb__before_spinlock();
    spin_lock_irq(&rq->lock);

    /* Default: account this as an involuntary context switch. */
    switch_count = &prev->nivcsw;
    if (!preempt && prev->state) {
        if (unlikely(signal_pending_state(prev->state, prev))) {
            /* A pending signal aborts the sleep: stay runnable. */
            prev->state = TASK_RUNNING;
        } else {
            deactivate_task(rq, prev, DEQUEUE_SLEEP);
            prev->on_rq = 0;
        }
        /* The task blocked voluntarily: account it as such. */
        switch_count = &prev->nvcsw;
    }

    put_prev_task(rq, prev);
    next = pick_next_task(rq);
    clear_tsk_need_resched(prev);
    clear_preempt_need_resched();
    rq->skip_clock_update = 0;

    if (likely(prev != next)) {
        rq->nr_switches++;
        rq->curr = next;
        ++*switch_count;

        context_switch(rq, prev, next); /* unlocks the rq */
        /*
         * The context switch has flipped the stack from under us
         * and restored the local variables which were saved when
         * this task called schedule() in the past. prev == current
         * is still correct, but it can be moved to another cpu/rq.
         */
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
    } else
        spin_unlock_irq(&rq->lock);
}

/*
 * schedule - voluntary rescheduling entry point.
 *
 * Loops until no resched request is pending, so a wakeup arriving
 * between the switch and the preempt re-enable is not lost.
 */
asmlinkage __visible void __sched schedule(void)
{
    for (;;) {
        preempt_disable();
        __schedule(false);
        sched_preempt_enable_no_resched();
        if (!need_resched())
            break;
    }
}

static void __sched notrace preempt_schedule_common(void)
{
    /*
     * Keep rescheduling until no preemption request remains, so an
     * opportunity arriving between __schedule() and the preempt
     * re-enable below is not missed.
     */
    for (;;) {
        preempt_disable();
        __schedule(true);
        preempt_enable_no_resched();

        if (!need_resched())
            break;
    }
}

/*
 * this is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable. Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 */
asmlinkage __visible void __sched notrace preempt_schedule(void)
{
    /*
     * If there is a non-zero preempt_count or interrupts are disabled,
     * we do not want to preempt the current task. Just return..
     */
    if (likely(!preemptible()))
        return;

    /* Safe to preempt: reschedule (loops until no resched pending). */
    preempt_schedule_common();
}

/*
 * this is the entry point to schedule() from kernel preemption
 * off of irq context.
 * Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
 */
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
    /* Catch callers which need to be fixed */
    BUG_ON(preempt_count() || !irqs_disabled());

    for (;;) {
        preempt_disable();
        /* __schedule() expects interrupts enabled on entry. */
        local_irq_enable();
        __schedule(true);
        /* Restore the irqs-off state the caller requires. */
        local_irq_disable();
        sched_preempt_enable_no_resched();

        if (!need_resched())
            break;
    }
}

/**
 * schedule_preempt_disabled - called with preemption disabled
 *
 * Returns with preemption disabled. Note: preempt_count must be 1
 */
void __sched schedule_preempt_disabled(void)
{
    /* Drop the single preempt reference without a resched check... */
    sched_preempt_enable_no_resched();
    schedule();
    /* ...and re-establish the preempt-disabled state for the caller. */
    preempt_disable();
}

/*
 * Default wait-queue callback: wake the task stored in the entry's
 * private field. @key is unused here; @sync is forwarded as the wake
 * flags. Returns the result of try_to_wake_up().
 */
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
              void *key)
{
    return try_to_wake_up(curr->private, mode, sync);
}

/*
 * Put the current task to sleep on @q in @state for at most @timeout
 * jiffies. Returns the remaining timeout from schedule_timeout().
 */
static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
    unsigned long flags;
    wait_queue_t wait;

    init_waitqueue_entry(&wait, current);

    /* Mark ourselves sleeping before becoming visible on the queue. */
    __set_current_state(state);

    spin_lock_irqsave(&q->lock, flags);
    __add_wait_queue(q, &wait);
    /*
     * Deliberate asymmetry: drop only the lock here (irqs stay off
     * until the schedule path re-enables them), restore @flags after
     * removal below.
     */
    spin_unlock(&q->lock);
    timeout = schedule_timeout(timeout);
    spin_lock_irq(&q->lock);
    __remove_wait_queue(q, &wait);
    spin_unlock_irqrestore(&q->lock, flags);

    return timeout;
}

/* Sleep on @q until woken or a signal arrives; no timeout. */
void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
    (void)sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}

/* Interruptible sleep on @q, bounded by @timeout jiffies. */
long __sched
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
    return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
}

/* Uninterruptible sleep on @q with no timeout. */
void __sched sleep_on(wait_queue_head_t *q)
{
    (void)sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}

/* Uninterruptible sleep on @q, bounded by @timeout jiffies. */
long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
    return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
}

/**
 * idle_cpu - is a given cpu idle currently?
 * @cpu: the processor in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */
int idle_cpu(int cpu)
{
    struct rq *rq = cpu_rq(cpu);

    /* Idle iff the idle thread is current and nothing is queued. */
    return rq->curr == rq->idle && !rq->nr_running;
}

/**
 * idle_task - return the idle task for a given cpu.
 * @cpu: the processor in question.
 *
 * Return: The idle task for the cpu @cpu.
 */
struct tcb *idle_task(int cpu)
{
    /* Per-cpu idle thread, installed by init_idle(). */
    return cpu_rq(cpu)->idle;
}

/*
 * Change @p's nice value, requeueing it if it is on a runqueue so the
 * new weight takes effect immediately. Out-of-range or unchanged nice
 * values are silently ignored.
 */
void sched_set_user_nice(struct tcb *p, long nice)
{
    int old_prio, delta, on_rq;
    unsigned long flags;
    struct rq *rq;

    /* Nothing to do for an unchanged or out-of-range nice value. */
    if (PRIO_TO_NICE(p->prio) == nice || nice < -20 || nice > 19)
        return;
    /*
     * We have to be careful, if called from sys_setpriority(),
     * the task might be in the middle of scheduling on another CPU.
     */
    rq = task_rq_lock(p, &flags);
    /*
     * The RT priorities are set via sched_set_scheduler(), but we still
     * allow the 'normal' nice value to be set - but as expected
     * it wont have any effect on scheduling until the task is
     * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
     */
    /* NOTE(review): despite the comment above, rt/dl tasks BUG() here. */
    if (task_has_dl_policy(p) || task_has_rt_policy(p))
        BUG();

    /* Take the task off the queue while its weight changes. */
    on_rq = p->on_rq;
    if (on_rq)
        dequeue_task(rq, p, 0);

    old_prio = p->prio;
    p->prio = NICE_TO_PRIO(nice);
    set_load_weight(p);
    delta = p->prio - old_prio;

    if (on_rq) {
        enqueue_task(rq, p, 0);
        /*
         * If the task increased its priority or is running and
         * lowered its priority, then reschedule its CPU:
         */
        if (delta < 0 || (delta > 0 && task_running(rq, p)))
            resched_task(rq->curr);
    }

    task_rq_unlock(rq, p, &flags);
}

/*
 * This function initializes the sched_dl_entity of a newly becoming
 * SCHED_DEADLINE task.
 *
 * Only the static values are considered here, the actual runtime and the
 * absolute deadline will be properly calculated when the task is enqueued
 * for the first time with its new policy.
 */
static void
__setparam_dl(struct tcb *p, const struct sched_attr *attr)
{
    struct sched_dl_entity *dl = &p->dl;

    init_dl_task_timer(dl);

    /* Static parameters, copied verbatim from the caller. */
    dl->dl_runtime = attr->sched_runtime;
    dl->dl_deadline = attr->sched_deadline;
    /* An unspecified (zero) period defaults to the deadline. */
    dl->dl_period = attr->sched_period ?: dl->dl_deadline;
    dl->dl_bw = to_ratio(dl->dl_period, dl->dl_runtime);

    /* Freshly parametrized: not throttled, treated as a new entity. */
    dl->dl_throttled = 0;
    dl->dl_new = 1;
}

/* Actually do priority change: must hold pi & rq lock. */
static void __setscheduler(struct rq *rq, struct tcb *p,
               const struct sched_attr *attr)
{
    int policy = attr->sched_policy;

    if (policy == -1) /* setparam */
        policy = p->policy;

    p->policy = policy;
    if (dl_policy(policy))
        __setparam_dl(p, attr);
    p->prio = attr->sched_priority;

    if (dl_policy(p->policy))
        p->sched_class = &dl_sched_class;
    else if (rt_prio(p->prio))
        p->sched_class = &rt_sched_class;
    else
        p->sched_class = &fair_sched_class;

    set_load_weight(p);
}

/*
 * This function validates the new parameters of a -deadline task.
 * We ask for the deadline not being zero, and greater or equal
 * than the runtime, as well as the period of being zero or
 * greater than deadline. Furthermore, we have to be sure that
 * user parameters are above the internal resolution (1us); we
 * check sched_runtime only since it is always the smaller one.
 */
static bool
__checkparam_dl(const struct sched_attr *attr)
{
    return attr && attr->sched_deadline != 0 &&
        (attr->sched_period == 0 ||
        (s64)(attr->sched_period   - attr->sched_deadline) >= 0) &&
        (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0  &&
        attr->sched_runtime >= (2 << (DL_SCALE - 1));
}

/*
 * Core policy/priority change. Validates @attr, then requeues @p under
 * its rq lock with the new class and parameters.
 *
 * Returns 0 on success, -EINVAL for bad policy/priority combinations or
 * an attempt to reschedule the stop thread, -EBUSY when deadline
 * bandwidth admission fails.
 */
static int __sched_setscheduler(struct tcb *p,
                const struct sched_attr *attr)
{
    int  oldprio, oldpolicy = -1, on_rq, running;
    int policy = attr->sched_policy;
    unsigned long flags;
    const struct sched_class *prev_class;
    struct rq *rq;


    /* may grab non-irq protected spin_locks */
    BUG_ON(in_interrupt());
recheck:
    /* double check policy once rq lock held */
    if (policy < 0) {
        /* Negative policy means "keep the task's current policy". */
        policy = oldpolicy = p->policy;
    } else {
        if (policy != SEMINIX_SCHED_DEADLINE &&
                policy != SEMINIX_SCHED_FIFO && policy != SEMINIX_SCHED_RR &&
                policy != SEMINIX_SCHED_NORMAL && policy != SEMINIX_SCHED_BATCH &&
                policy != SEMINIX_SCHED_IDLE)
            return -EINVAL;
    }

    /* rt policies require a non-zero priority, all others a zero one. */
    if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
        (rt_policy(policy) != (attr->sched_priority != 0)))
        return -EINVAL;

    /*
     * make sure no PI-waiters arrive (or leave) while we are
     * changing the priority of the task:
     *
     * To be able to change p->policy safely, the appropriate
     * runqueue lock must be held.
     */
    rq = task_rq_lock(p, &flags);

    /*
     * Changing the policy of the stop threads its a very bad idea
     */
    if (p == rq->stop) {
        task_rq_unlock(rq, p, &flags);
        return -EINVAL;
    }

    /*
     * If not changing anything there's no need to proceed further:
     */
    if (unlikely(policy == p->policy)) {
        if (fair_policy(policy) && (int)attr->sched_priority != p->prio)
            goto change;
        if (rt_policy(policy) && (int)attr->sched_priority != p->prio)
            goto change;
        if (dl_policy(policy))
            goto change;

        task_rq_unlock(rq, p, &flags);
        return 0;
    }
change:
    /* recheck policy now with rq lock held */
    if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
        /* The policy changed under us: drop the lock and start over. */
        policy = oldpolicy = -1;
        task_rq_unlock(rq, p, &flags);
        goto recheck;
    }

    /*
     * If setscheduling to SCHED_DEADLINE (or changing the parameters
     * of a SCHED_DEADLINE task) we need to check if enough bandwidth
     * is available.
     */
    if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
        task_rq_unlock(rq, p, &flags);
        return -EBUSY;
    }

    /* Take the task off the queue/cpu while its class changes. */
    on_rq = p->on_rq;
    running = task_current(rq, p);
    if (on_rq)
        dequeue_task(rq, p, 0);
    if (running)
        p->sched_class->put_prev_task(rq, p);

    oldprio = p->prio;
    prev_class = p->sched_class;
    __setscheduler(rq, p, attr);

    /* Put it back with the new class in effect. */
    if (running)
        p->sched_class->set_curr_task(rq);
    if (on_rq)
        enqueue_task(rq, p, 0);

    check_class_changed(rq, p, prev_class, oldprio);
    task_rq_unlock(rq, p, &flags);

    return 0;
}

/**
 * sched_set_scheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 *
 * NOTE that the task may be already dead.
 */
int sched_set_scheduler(struct tcb *p, int policy, int priority)
{
    /* Wrap the legacy (policy, priority) pair into a sched_attr;
     * all other attr fields stay zero. */
    struct sched_attr attr = {
        .sched_priority = priority,
        .sched_policy   = policy,
    };

    return __sched_setscheduler(p, &attr);
}

/* Apply a full attribute set to @p; thin wrapper, same return codes. */
int sched_setattr(struct tcb *p, const struct sched_attr *attr)
{
    return __sched_setscheduler(p, attr);
}

/*
 * sched_get_sched_attr - read back @p's scheduling attributes into @attr.
 *
 * Silently does nothing when either pointer is NULL. Deadline
 * parameters are filled in only for SCHED_DEADLINE tasks.
 *
 * Fix: the original called get_task_struct() without a matching
 * put_task_struct(), leaking one task reference per call (compare
 * sched_rr_getinterval(), which pairs them).
 */
void sched_get_sched_attr(struct tcb *p, struct sched_attr *attr)
{
    if (!attr || !p)
        return;

    /* Pin the task while we read its fields. */
    get_task_struct(p);

    attr->sched_policy = p->policy;
    attr->sched_priority = p->prio;

    if (dl_policy(attr->sched_policy)) {
        struct sched_dl_entity *dl_se = &p->dl;

        attr->sched_runtime = dl_se->dl_runtime;
        attr->sched_deadline = dl_se->dl_deadline;
        attr->sched_period = dl_se->dl_period;
    }

    /* Drop the reference taken above; it was leaked before. */
    put_task_struct(p);
}

/*
 * Report @p's round-robin timeslice in @interval (zero for classes
 * without a get_rr_interval hook). Returns 0, -ESRCH for a NULL task,
 * -EINVAL for a NULL result pointer.
 */
int sched_rr_getinterval(struct tcb *p, struct timespec *interval)
{
    unsigned int slice = 0;
    unsigned long flags;
    struct rq *rq;

    if (!p)
        return -ESRCH;
    if (!interval)
        return -EINVAL;

    /* Pin the task and its rq while querying the class. */
    get_task_struct(p);
    rq = task_rq_lock(p, &flags);
    if (p->sched_class->get_rr_interval)
        slice = p->sched_class->get_rr_interval(rq, p);
    task_rq_unlock(rq, p, &flags);
    put_task_struct(p);

    jiffies_to_timespec(slice, interval);
    return 0;
}

#ifdef CONFIG_SMP
/*
 * Move @p from @src_cpu's runqueue to @dest_cpu's. Both rq locks and
 * the task's pi_lock are held across the move; a task that already
 * left @src_cpu is left alone.
 */
static void __migrate_task(struct tcb *p, int src_cpu, int dest_cpu)
{
    struct rq *rq_dest, *rq_src;

    rq_src = cpu_rq(src_cpu);
    rq_dest = cpu_rq(dest_cpu);

    raw_spin_lock(&p->pi_lock);
    double_rq_lock(rq_src, rq_dest);
    /* Already moved. */
    if (task_cpu(p) != src_cpu)
        goto done;

    /*
     * If we're not on a rq, the next wake-up will ensure we're
     * placed properly.
     */
    if (p->on_rq) {
        dequeue_task(rq_src, p, 0);
        __set_task_cpu(p, dest_cpu);
        enqueue_task(rq_dest, p, 0);
        /* The migrated task may preempt whatever runs on dest. */
        check_preempt_curr(rq_dest, p, 0);
    }
done:
    double_rq_unlock(rq_src, rq_dest);
    raw_spin_unlock(&p->pi_lock);
}

/* Arguments passed to the cross-cpu migration IPI (migration_cpu()). */
struct migration_arg {
    struct tcb *task;          /* task being migrated */
    int dest_cpu;              /* target cpu */
    struct completion  done;   /* signalled when the move finished */
};

/*
 * IPI callback: migrate arg->task away from this cpu to arg->dest_cpu,
 * then signal the waiter. Runs with interrupts disabled.
 */
static void migration_cpu(void *data)
{
    struct migration_arg *arg = data;
    int this_cpu = smp_processor_id();

    assert(irqs_disabled());
    __migrate_task(arg->task, this_cpu, arg->dest_cpu);
    complete(&arg->done);
}

/*
 * set_cpu_allowed_ptr - move @p to @dest_cpu.
 *
 * A queued task is migrated by an IPI on its current cpu and we wait
 * for that to finish; a sleeping task just gets its cpu re-pointed.
 *
 * Return: 0 on success.
 *
 * Fix: the original used wait_for_completion_interruptible() on a
 * completion embedded in this stack frame. A signal would unwind the
 * frame while the asynchronous IPI handler (smp_call_function_single
 * with wait == 0) could still call complete() on it — a stack
 * use-after-free. Wait uninterruptibly instead.
 */
static int set_cpu_allowed_ptr(struct tcb *p, int dest_cpu)
{
    unsigned long flags;
    struct rq *rq;

    rq = task_rq_lock(p, &flags);
    if (p->on_rq) {
        struct migration_arg arg = { .task = p, .dest_cpu = dest_cpu, };

        init_completion(&arg.done);
        task_rq_unlock(rq, p, &flags);
        smp_call_function_single(cpu_of(rq), migration_cpu, &arg, 0);
        /*
         * @arg lives on this stack frame: we must not return before
         * migration_cpu() has signalled completion.
         */
        wait_for_completion(&arg.done);
        return 0;
    }
    __set_task_cpu(p, dest_cpu);
    task_rq_unlock(rq, p, &flags);
    return 0;
}
#else
/* UP stub: with a single cpu there is nothing to migrate. */
static inline int set_cpu_allowed_ptr(struct tcb *p, int dest_cpu)
{
    return 0;
}
#endif

/*
 * Pin @p to @cpu. Rejects NULL tasks and offline cpus with -EINVAL,
 * otherwise returns the migration result.
 */
int sched_set_affinity(struct tcb *p, int cpu)
{
    int err;

    if (!p || !cpu_online(cpu))
        return -EINVAL;

    /* Hold a reference so @p cannot vanish during the migration. */
    get_task_struct(p);
    err = set_cpu_allowed_ptr(p, cpu);
    put_task_struct(p);

    return err;
}

/*
 * Yield the cpu: let the current class requeue us, then reschedule.
 * Preemption is held across the rq unlock so the reschedule happens
 * via the explicit schedule() call below, not an unlock-time preempt.
 */
static void do_sched_yield(void)
{
    struct rq *rq = this_rq_lock();

    current->sched_class->yield_task(rq);

    preempt_disable();
    spin_unlock(&rq->lock);
    preempt_enable_no_resched();

    schedule();
}

/**
 * yield - yield the current processor to other threads.
 *
 * This is a shortcut for kernel-space yielding - it marks the
 * thread runnable and calls sys_sched_yield().
 */
void __sched yield(void)
{
    /* Stay runnable: yielding must not put us to sleep. */
    set_current_state(TASK_RUNNING);
    do_sched_yield();
}

/* One display character per task-state bit; indexed by __ffs(state)+1
 * in sched_show_task() ('R' for running, 'S' sleeping, ...). */
#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;

/*
 * Print a one-line summary (name, state char, saved PC, tid, thread
 * flags) for @p, followed by its stack trace.
 */
void sched_show_task(struct tcb *p)
{
    unsigned state;

    /* Map the state bitmask to an index into stat_nam (0 == running). */
    state = p->state ? __ffs(p->state) + 1 : 0;
    printk(KERN_INFO "%-15.15s %c", p->comm,
        state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
    if (state == TASK_RUNNING)
        printk(KERN_CONT " running  ");
    else
        printk(KERN_CONT " %08lx ", thread_saved_pc(p));
#else
    if (state == TASK_RUNNING)
        printk(KERN_CONT "  running task    ");
    else
        printk(KERN_CONT " %016lx ", thread_saved_pc(p));
#endif
    printk(KERN_CONT " %6d 0x%08lx\n",
        p->tid, (unsigned long)task_thread_info(p)->flags);

    show_stack(p, NULL);
}

/*
 * Dump every thread whose state matches @state_filter (0 dumps all).
 */
void show_state_filter(unsigned long state_filter)
{
    struct tcb *t;

#if BITS_PER_LONG == 32
    printk(KERN_INFO
        "  task                PC stack   pid father\n");
#else
    printk(KERN_INFO
        "  task                        PC stack   pid father\n");
#endif
    do_each_thread(t) {
        if (!state_filter || (t->state & state_filter))
            sched_show_task(t);
    } while_each_thread(t);
}

/* Dump the task currently running on @cpu. */
void dump_cpu_task(int cpu)
{
    pr_info("Task dump for CPU %d:\n", cpu);
    sched_show_task(cpu_curr(cpu));
}

/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */
void __init init_idle(struct tcb *idle, int cpu)
{
    struct rq *rq = cpu_rq(cpu);

    __set_task_cpu(idle, cpu);

    /* Lowest possible priority and the dedicated idle policy. */
    idle->prio = MAX_PRIO;
    idle->mcprio = MAX_PRIO;
    idle->policy = SEMINIX_SCHED_IDLE;
    sched_new(idle, TASK_RUNNING);
    idle->se.exec_start = sched_clock();

    idle->flags |= PF_IDLE;

    /* Install as both the current and the idle task of this rq. */
    rq->curr = rq->idle = idle;
#ifdef CONFIG_SMP
    idle->on_cpu = 1;
#endif

    /* Set the preempt count _outside_ the spinlocks! */
    init_idle_preempt_count(idle, cpu);

    /*
     * The idle tasks have their own, simple scheduling class:
     */
    idle->sched_class = &idle_sched_class;
    set_load_weight(idle);
#if defined(CONFIG_SMP)
    /* e.g. "swapper/3": suffix the comm with the cpu number. */
    sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
}

/* SMP-phase scheduler init: tune granularity for the final cpu count. */
void __init sched_init_smp(void)
{
    sched_init_granularity();
}

/*
 * Boot-time scheduler initialization: default rt/dl bandwidth, every
 * per-cpu runqueue (cfs/rt/dl sub-queues, load history, hrtick), and
 * finally the boot thread is turned into this cpu's idle task.
 */
void __init sched_init(void)
{

    int i, j;

    init_rt_bandwidth(&def_rt_bandwidth,
            global_rt_period(), global_rt_runtime());
    init_dl_bandwidth(&def_dl_bandwidth,
            global_rt_period(), global_rt_runtime());

    for_each_possible_cpu(i) {
        struct rq *rq;

        rq = cpu_rq(i);
        spin_lock_init(&rq->lock);
        rq->nr_running = 0;
        init_cfs_rq(&rq->cfs);
        init_rt_rq(&rq->rt, rq);
        init_dl_rq(&rq->dl, rq);

        rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;

        /* Zero the cpu-load history used by the load balancer. */
        for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
            rq->cpu_load[j] = 0;

        rq->last_load_update_tick = jiffies_64;

#ifdef CONFIG_SMP
        rq->cpu = i;
#endif
        rq->last_sched_tick = 0;
        init_rq_hrtick(rq);
    }
    enter_lazy_tlb(&init_mm, current);

    /*
     * Make us the idle thread. Technically, schedule() should not be
     * called from this thread, however somewhere below it might be,
     * but because we are the idle thread, we just pick up running again
     * when this runqueue becomes "idle".
     */
    init_idle(current, smp_processor_id());
}

/*
 * Propagate the global rt runtime limit to every per-cpu rt_rq,
 * holding the default bandwidth lock across the whole sweep.
 * Always succeeds (returns 0).
 */
static int sched_rt_global_constraints(void)
{
    unsigned long flags;
    int cpu;

    raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
    for_each_possible_cpu(cpu) {
        struct rt_rq *rt_rq = &cpu_rq(cpu)->rt;

        raw_spin_lock(&rt_rq->rt_runtime_lock);
        rt_rq->rt_runtime = global_rt_runtime();
        raw_spin_unlock(&rt_rq->rt_runtime_lock);
    }
    raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);

    return 0;
}

/*
 * Verify that the new global bandwidth is not smaller than what is
 * already allocated on any cpu. Returns 0 when acceptable, -EBUSY when
 * some cpu already holds more deadline bandwidth than the new limit.
 *
 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
 * cycling on root_domains... Discussion on different/better
 * solutions is welcome!
 */
static int sched_dl_global_constraints(void)
{
    u64 period = global_rt_period();
    u64 runtime = global_rt_runtime();
    u64 new_bw = to_ratio(period, runtime);
    unsigned long flags;
    int cpu;

    for_each_possible_cpu(cpu) {
        struct dl_bw *dl_b = dl_bw_of(cpu);
        bool over;

        raw_spin_lock_irqsave(&dl_b->lock, flags);
        over = new_bw < dl_b->total_bw;
        raw_spin_unlock_irqrestore(&dl_b->lock, flags);

        if (over)
            return -EBUSY;
    }

    return 0;
}

/*
 * Apply the current global rt period/runtime to the default deadline
 * bandwidth and to every per-cpu dl_bw.
 */
static void sched_dl_do_global(void)
{
    unsigned long flags;
    u64 new_bw;
    int cpu;

    def_dl_bandwidth.dl_period = global_rt_period();
    def_dl_bandwidth.dl_runtime = global_rt_runtime();

    /* RUNTIME_INF maps to "unlimited" bandwidth (all bits set). */
    if (global_rt_runtime() == RUNTIME_INF)
        new_bw = -1;
    else
        new_bw = to_ratio(global_rt_period(), global_rt_runtime());

    /*
     * FIXME: As above...
     */
    for_each_possible_cpu(cpu) {
        struct dl_bw *dl_b = dl_bw_of(cpu);

        raw_spin_lock_irqsave(&dl_b->lock, flags);
        dl_b->bw = new_bw;
        raw_spin_unlock_irqrestore(&dl_b->lock, flags);
    }
}

/*
 * Sanity-check the rt sysctls: the period must be positive, and a
 * finite runtime must fit within the period. Returns 0 or -EINVAL.
 */
static int sched_rt_global_validate(void)
{
    if (sysctl_sched_rt_period <= 0)
        return -EINVAL;

    if (((u64)sysctl_sched_rt_runtime != RUNTIME_INF) &&
        (sysctl_sched_rt_runtime > (int)sysctl_sched_rt_period))
        return -EINVAL;

    return 0;
}

/* Publish the validated rt sysctls into the default rt bandwidth. */
static void sched_rt_do_global(void)
{
    def_rt_bandwidth.rt_runtime = global_rt_runtime();
    def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
}

/*
 * Validate and apply new rt/dl bandwidth settings, rolling the sysctls
 * back to their previous values on any failure.
 *
 * NOTE(review): @rt_runtime and @rt_period are not referenced here —
 * presumably the caller stores them into the sysctls before calling;
 * confirm against the call site.
 */
int sched_rt_handler(unsigned long rt_runtime, unsigned long rt_period)
{
    static DEFINE_MUTEX(mutex);
    int old_period, old_runtime;
    int ret;

    mutex_lock(&mutex);
    /* Snapshot the current settings so a failed update can be undone. */
    old_period = sysctl_sched_rt_period;
    old_runtime = sysctl_sched_rt_runtime;

    ret = sched_rt_global_validate();
    if (ret)
        goto undo;

    ret = sched_rt_global_constraints();
    if (ret)
        goto undo;

    ret = sched_dl_global_constraints();
    if (ret)
        goto undo;

    /* All checks passed: apply the new rt and dl bandwidth globally. */
    sched_rt_do_global();
    sched_dl_do_global();
    goto out;

undo:
    sysctl_sched_rt_period = old_period;
    sysctl_sched_rt_runtime = old_runtime;
out:
    mutex_unlock(&mutex);

    return ret;
}

/*
 * Normalize the round-robin timeslice sysctl: non-positive values fall
 * back to the default, positive values are treated as milliseconds and
 * converted to jiffies in place. Always returns 0.
 *
 * NOTE(review): the conversion is not idempotent — a second call would
 * re-convert an already-converted (jiffies) value; confirm callers only
 * invoke this right after the sysctl is written.
 */
int sched_rr_handler(void)
{

    static DEFINE_MUTEX(mutex);

    mutex_lock(&mutex);
    if (sysctl_sched_rr_timeslice <= 0)
        sysctl_sched_rr_timeslice = RR_TIMESLICE;
    else
        sysctl_sched_rr_timeslice =
            msecs_to_jiffies(sysctl_sched_rr_timeslice);
    mutex_unlock(&mutex);
    return 0;
}

/* Used instead of source_load when we know the type == 0 */
/* Used instead of source_load when we know the type == 0 */
static u32 weighted_cpuload(const int cpu)
{
    /* Instantaneous weighted load of @cpu's runqueue. */
    return cpu_rq(cpu)->load.weight;
}

/*
 * Return a low guess at the load of a migration-source cpu weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
static u32 source_load(int cpu, int type)
{
    struct rq *rq = cpu_rq(cpu);
    u32 total = weighted_cpuload(cpu);

    if (type == 0)
        return total;

    return min((u32)rq->cpu_load[type - 1], total);
}

/*
 * Return a high guess at the load of a migration-target cpu weighted
 * according to the scheduling class and "nice" value.
 */
static u32 target_load(int cpu, int type)
{
    struct rq *rq = cpu_rq(cpu);
    u32 total = weighted_cpuload(cpu);

    if (type == 0)
        return total;

    return max((u32)rq->cpu_load[type - 1], total);
}

/*
 * Return the average load per task on the cpu's run queue
 */
/*
 * Average weighted load per runnable task on @cpu; an empty runqueue
 * reports SCHED_LOAD_SCALE rather than dividing by zero.
 */
u32 cpu_avg_load_per_task(int cpu)
{
    struct rq *rq = cpu_rq(cpu);
    unsigned long nr = rq->nr_running;

    if (!nr)
        return SCHED_LOAD_SCALE;

    return weighted_cpuload(cpu) / nr;
}

/*
 * Find the least-loaded online cpu. The local cpu is measured with the
 * conservative source estimate, remote cpus with the target one. The
 * winning load is stored in *@load_move when non-NULL. Returns the cpu
 * number, or -1 when no online cpu was found. (@p is currently unused.)
 */
int find_idlest_cpu(struct tcb *p, int this_cpu, u32 *load_move, int type)
{
    u32 best_load = UINT_MAX;
    int best_cpu = -1;
    int cpu;

    for_each_online_cpu(cpu) {
        u32 load;

        if (cpu == this_cpu)
            load = source_load(cpu, type);
        else
            load = target_load(cpu, type);

        if (load < best_load) {
            best_load = load;
            best_cpu = cpu;
        }
    }

    if (load_move)
        *load_move = best_load;

    return best_cpu;
}

/*
 * Find the most-loaded online cpu — the mirror image of
 * find_idlest_cpu(): the local cpu uses the high (target) estimate,
 * remote cpus the low (source) one. The winning load is stored in
 * *@load_move when non-NULL. Returns the cpu, or -1 when every load
 * is zero. (@p is currently unused.)
 */
int find_busiest_cpu(struct tcb *p, int this_cpu, u32 *load_move, int type)
{
    u32 worst_load = 0;
    int worst_cpu = -1;
    int cpu;

    for_each_online_cpu(cpu) {
        u32 load;

        if (cpu == this_cpu)
            load = target_load(cpu, type);
        else
            load = source_load(cpu, type);

        if (load > worst_load) {
            worst_load = load;
            worst_cpu = cpu;
        }
    }

    if (load_move)
        *load_move = worst_load;

    return worst_cpu;
}


/*
 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
 * number) then we wake all the non-exclusive tasks and one exclusive task.
 *
 * There are circumstances in which we can try to wake a task which has already
 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
 * zero in this (rare) case, and we handle it by continuing to scan the queue.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                 int nr_exclusive, int sync, void *key)
{
    wait_queue_t *curr, *next;

    /* _safe: a woken entry may remove itself from the list. */
    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        /* Snapshot flags: the entry may be freed once woken. */
        unsigned flags = curr->flags;

        /* Stop after nr_exclusive successful exclusive wakeups. */
        if (curr->func(curr, mode, sync, key) &&
                (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}

/**
 * __wake_up - wake up threads blocked on a waitqueue.
 * @q: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 * @key: is directly passed to the wakeup function
 */
void __wake_up(wait_queue_head_t *q, unsigned int mode,
            int nr_exclusive, void *key)
{
    unsigned long flags;

    /* Non-sync wakeup under the queue lock. */
    spin_lock_irqsave(&q->lock, flags);
    __wake_up_common(q, mode, nr_exclusive, 0, key);
    spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
 */
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
{
    /* Caller holds q->lock; wake one exclusive waiter, no key. */
    __wake_up_common(q, mode, 1, 0, NULL);
}

/**
 * __wake_up_sync - wake up threads blocked on a waitqueue.
 * @q: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 *
 * The sync wakeup differs that the waker knows that it will schedule
 * away soon, so while the target thread will be woken up, it will not
 * be migrated to another CPU - ie. the two threads are 'synchronized'
 * with each other. This can prevent needless bouncing between CPUs.
 *
 * On UP it can prevent extra preemption.
 */
/*
 * Synchronous wakeup: the waker promises to schedule away soon, so the
 * wakee need not be migrated to another cpu. A zero @nr_exclusive
 * degrades to a plain (non-sync) wakeup; a NULL queue is a no-op.
 */
void
__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
{
    unsigned long flags;
    int sync;

    if (unlikely(!q))
        return;

    sync = unlikely(!nr_exclusive) ? 0 : WF_SYNC;

    spin_lock_irqsave(&q->lock, flags);
    __wake_up_common(q, mode, nr_exclusive, sync, NULL);
    spin_unlock_irqrestore(&q->lock, flags);
}

/* Signal @x once: bump the done count and wake a single waiter. */
void complete(struct completion *x)
{
    unsigned long flags;

    spin_lock_irqsave(&x->wait.lock, flags);
    x->done++;
    __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
             1, 0, NULL);
    spin_unlock_irqrestore(&x->wait.lock, flags);
}

/*
 * Signal @x "forever": add an effectively inexhaustible done count
 * (UINT_MAX/2) and wake every waiter (nr_exclusive == 0).
 */
void complete_all(struct completion *x)
{
    unsigned long flags;

    spin_lock_irqsave(&x->wait.lock, flags);
    x->done += UINT_MAX / 2;
    __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
             0, 0, NULL);
    spin_unlock_irqrestore(&x->wait.lock, flags);
}

/*
 * Wait for @x->done under x->wait.lock (held by the caller, dropped
 * across each schedule). Returns the remaining timeout (0 on timeout),
 * or -ERESTART when a signal interrupts a TASK_INTERRUPTIBLE wait.
 * On success one done count is consumed.
 */
static inline long __sched
do_wait_for_common(struct completion *x, long timeout, int state)
{
    if (!x->done) {
        DECLARE_WAITQUEUE(wait, current);

        /* Exclusive: complete() wakes exactly one of us. */
        wait.flags |= WQ_FLAG_EXCLUSIVE;
        __add_wait_queue_tail(&x->wait, &wait);
        do {
            /* Check for a signal before committing to sleep. */
            if (state == TASK_INTERRUPTIBLE &&
                signal_pending(current)) {
                __remove_wait_queue(&x->wait, &wait);
                return -ERESTART;
            }
            __set_current_state(state);
            spin_unlock_irq(&x->wait.lock);
            timeout = schedule_timeout(timeout);
            spin_lock_irq(&x->wait.lock);
            if (!timeout) {
                /* Timed out: report 0 without consuming done. */
                __remove_wait_queue(&x->wait, &wait);
                return timeout;
            }
        } while (!x->done);
        __remove_wait_queue(&x->wait, &wait);
    }
    /* Consume one completion. */
    x->done--;
    return timeout;
}

/* Lock wrapper around do_wait_for_common(); same return values. */
static long __sched
wait_for_common(struct completion *x, long timeout, int state)
{
    spin_lock_irq(&x->wait.lock);
    timeout = do_wait_for_common(x, timeout, state);
    spin_unlock_irq(&x->wait.lock);
    return timeout;
}

/* Block uninterruptibly until @x is completed. */
void __sched wait_for_completion(struct completion *x)
{
    wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}

/*
 * Block uninterruptibly for @x, at most @timeout jiffies. Returns the
 * remaining timeout, 0 on timeout.
 */
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
    return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
}

/*
 * Block interruptibly until @x is completed. Returns 0 on completion
 * or -ERESTART when interrupted by a signal.
 */
int __sched wait_for_completion_interruptible(struct completion *x)
{
    long ret = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);

    /* Only the signal case propagates; everything else is success. */
    return ret == -ERESTART ? ret : 0;
}

/*
 * Block interruptibly for @x, at most @timeout jiffies. Returns the
 * remaining timeout, 0 on timeout, or (unsigned long)-ERESTART when
 * interrupted by a signal.
 */
unsigned long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
                      unsigned long timeout)
{
    return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
}
